/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 Regents of the University of California
 */

.global memcpy
.type memcpy,@function
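/*
 * The scalar fallback below assumes RV64: SZREG is the register width in
 * bytes and REG_L/REG_S are the matching full-width load/store instructions.
 */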
#ifndef __riscv_vector
#define SZREG 8
#define REG_S sd
#define REG_L ld
#endif

memcpy:
#ifdef __riscv_vector
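        /*
         * Strip-mined byte copy: each vsetvli returns in t0 the number of
         * bytes the vector unit will handle this iteration (up to VLMAX
         * for e8/m8), and the loop repeats until the count in a2 is zero.
         */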
        mv      a3, a0                          # Work on a copy of dst; a0 is the return value
.Lloop:
        vsetvli t0, a2, e8, m8, ta, ma          # t0 = bytes handled this pass (8-bit elements, LMUL=8)
        vle8.v  v0, (a1)                        # Load bytes from src
        add     a1, a1, t0                      # Bump src pointer
        sub     a2, a2, t0                      # Decrement remaining count
        vse8.v  v0, (a3)                        # Store bytes to dst
        add     a3, a3, t0                      # Bump dst pointer
        bnez    a2, .Lloop                      # Any bytes left?
        ret                                     # Return original dst in a0
#else
        /* Save for return value */
        mv      t6, a0

        /*
         * Register allocation for code below:
         * a0 - start of uncopied dst
         * a1 - start of uncopied src
         * t0 - end of uncopied dst
         */
        add     t0, a0, a2

        /*
         * Use bytewise copy if too small.
         *
         * This threshold must be at least 2*SZREG to ensure at least one
         * wordwise copy is performed. It is chosen to be 16 because it will
         * save at least 7 iterations of bytewise copy, which pays off the
         * fixed overhead.
         */
        li      a3, 16
        bltu    a2, a3, .Lbyte_copy_tail

        /*
         * Bytewise copy first to align a0 to a word boundary. The byte
         * count in a2 is no longer needed now that t0 marks the end of
         * dst, so a2 is reused below as a0 rounded up to SZREG.
         */
        addi    a2, a0, SZREG-1
        andi    a2, a2, ~(SZREG-1)
        beq     a0, a2, 2f
1:
        lb      a5, 0(a1)
        addi    a1, a1, 1
        sb      a5, 0(a0)
        addi    a0, a0, 1
        bne     a0, a2, 1b
2:

        /*
         * Now a0 is word-aligned. If a1 is also word-aligned, we can do an
         * aligned word-wise copy. Otherwise we need the misaligned
         * word-wise copy below.
         */
        andi    a3, a1, SZREG-1
        bnez    a3, .Lmisaligned_word_copy

        /* Unrolled wordwise copy */
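        /*
         * t0 is first biased down by 16*SZREG-1 so the loop body only runs
         * while a full 16*SZREG-byte block remains. The 16 words of each
         * block are moved as 11 loads/stores followed by 5, matching the
         * number of free temporaries (a2-a7, t1-t5).
         */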
        addi    t0, t0, -(16*SZREG-1)
        bgeu    a0, t0, 2f
1:
        REG_L   a2,        0(a1)
        REG_L   a3,    SZREG(a1)
        REG_L   a4,  2*SZREG(a1)
        REG_L   a5,  3*SZREG(a1)
        REG_L   a6,  4*SZREG(a1)
        REG_L   a7,  5*SZREG(a1)
        REG_L   t1,  6*SZREG(a1)
        REG_L   t2,  7*SZREG(a1)
        REG_L   t3,  8*SZREG(a1)
        REG_L   t4,  9*SZREG(a1)
        REG_L   t5, 10*SZREG(a1)
        REG_S   a2,        0(a0)
        REG_S   a3,    SZREG(a0)
        REG_S   a4,  2*SZREG(a0)
        REG_S   a5,  3*SZREG(a0)
        REG_S   a6,  4*SZREG(a0)
        REG_S   a7,  5*SZREG(a0)
        REG_S   t1,  6*SZREG(a0)
        REG_S   t2,  7*SZREG(a0)
        REG_S   t3,  8*SZREG(a0)
        REG_S   t4,  9*SZREG(a0)
        REG_S   t5, 10*SZREG(a0)
        REG_L   a2, 11*SZREG(a1)
        REG_L   a3, 12*SZREG(a1)
        REG_L   a4, 13*SZREG(a1)
        REG_L   a5, 14*SZREG(a1)
        REG_L   a6, 15*SZREG(a1)
        addi    a1, a1, 16*SZREG
        REG_S   a2, 11*SZREG(a0)
        REG_S   a3, 12*SZREG(a0)
        REG_S   a4, 13*SZREG(a0)
        REG_S   a5, 14*SZREG(a0)
        REG_S   a6, 15*SZREG(a0)
        addi    a0, a0, 16*SZREG
        bltu    a0, t0, 1b
2:
        /*
         * Undo the unrolled loop's 16*SZREG-1 bias and apply the SZREG-1
         * bias for the wordwise loop below: net adjustment +15*SZREG.
         */
        addi    t0, t0, 15*SZREG

        /* Wordwise copy */
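        /* t0 = end of dst - (SZREG-1): copy words while at least SZREG bytes remain */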
        bgeu    a0, t0, 2f
1:
        REG_L   a5, 0(a1)
        addi    a1, a1, SZREG
        REG_S   a5, 0(a0)
        addi    a0, a0, SZREG
        bltu    a0, t0, 1b
2:
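        /* Undo the SZREG-1 bias so t0 is the true end of dst again */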
        addi    t0, t0, SZREG-1

.Lbyte_copy_tail:
        /*
         * Bytewise copy anything left.
         */
        beq     a0, t0, 2f
1:
        lb      a5, 0(a1)
        addi    a1, a1, 1
        sb      a5, 0(a0)
        addi    a0, a0, 1
        bne     a0, t0, 1b
2:

        mv      a0, t6
        ret

.Lmisaligned_word_copy:
        /*
         * Misaligned word-wise copy.
         * We still copy a word at a time, but each stored word is assembled
         * from the word fetched in the previous iteration and the newly
         * fetched one, combined with shifts. This is safe because we never
         * access more source words than the copied bytes actually span.
         */
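        /*
         * Example (RV64, little-endian, src offset a3 = 3): t3 = 24 and the
         * effective left shift is 64 - 24 = 40, so each stored word is
         * (prev >> 24) | (next << 40): its low 5 bytes come from the top of
         * the previous source word, its high 3 bytes from the bottom of the
         * next one.
         */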

        /* Calculate shifts: t3 = bit offset of src within a word, t4 = its complement */
        slli    t3, a3, 3
        sub     t4, x0, t3 /* negate is okay as shift will only look at LSBs */

        /* Load the initial value and align a1 */
        andi    a1, a1, ~(SZREG-1)
        REG_L   a5, 0(a1)

        /* Bias t0 so the loop below stops once fewer than SZREG bytes remain */
        addi    t0, t0, -(SZREG-1)
        /* The size threshold guarantees at least one iteration, no entry check */
1:
        srl     a4, a5, t3
        REG_L   a5, SZREG(a1)
        addi    a1, a1, SZREG
        sll     a2, a5, t4
        or      a2, a2, a4
        REG_S   a2, 0(a0)
        addi    a0, a0, SZREG
        bltu    a0, t0, 1b

        /*
         * Undo the SZREG-1 bias on t0 and add back the a3 bytes that a1
         * was aligned down by, so both point at the uncopied tail again.
         */
        addi    t0, t0, SZREG-1
        add     a1, a1, a3

        j       .Lbyte_copy_tail
#endif
